import geopandas as gpd
import pandas as pd
import altair as alt
geo_states = gpd.read_file('data/gz_2010_us_040_00_500k.json')
df_polls = pd.read_csv('data/presidential_poll_averages_2020.csv')
20 IM939 - Lab 6 - Part 3 - Choropleths
A visualisation often shown is a choropleth. This is a series of spatial polygons (such as states in the USA) which are coloured by a feature. Here we will look at creating choropleths of polling data in the recent USA election.
Load in two datasets. One contains the geospatial polygons of the states in America and the other is the polling data we used in the last notebook.
Filter our poll data to a specific date.
# Keep only the poll averages whose model date is 11/3/2020.
df_nov = df_polls.loc[df_polls['modeldate'] == '11/3/2020']
# From those rows, keep the entries for the two candidates we compare.
df_nov_states = df_nov.loc[
    df_nov['candidate_name'].isin(['Donald Trump', 'Joseph R. Biden Jr.'])
]
df_nov_states
| | cycle | state | modeldate | candidate_name | pct_estimate | pct_trend_adjusted |
|---|---|---|---|---|---|---|
| 0 | 2020 | Wyoming | 11/3/2020 | Joseph R. Biden Jr. | 30.81486 | 30.82599 |
| 1 | 2020 | Wisconsin | 11/3/2020 | Joseph R. Biden Jr. | 52.12642 | 52.09584 |
| 2 | 2020 | West Virginia | 11/3/2020 | Joseph R. Biden Jr. | 33.49125 | 33.51517 |
| 3 | 2020 | Washington | 11/3/2020 | Joseph R. Biden Jr. | 59.34201 | 59.39408 |
| 4 | 2020 | Virginia | 11/3/2020 | Joseph R. Biden Jr. | 53.74120 | 53.72101 |
| ... | ... | ... | ... | ... | ... | ... |
| 107 | 2020 | California | 11/3/2020 | Donald Trump | 32.28521 | 32.43615 |
| 108 | 2020 | Arkansas | 11/3/2020 | Donald Trump | 58.39097 | 58.94886 |
| 109 | 2020 | Arizona | 11/3/2020 | Donald Trump | 46.11074 | 46.10181 |
| 110 | 2020 | Alaska | 11/3/2020 | Donald Trump | 50.99835 | 51.23236 |
| 111 | 2020 | Alabama | 11/3/2020 | Donald Trump | 57.36153 | 57.36126 |
112 rows × 6 columns
The geo_states variable has polygons for each state.
geo_states.head()
| | GEO_ID | STATE | NAME | LSAD | CENSUSAREA | geometry |
|---|---|---|---|---|---|---|
| 0 | 0400000US23 | 23 | Maine | | 30842.923 | MULTIPOLYGON (((-67.61976 44.51975, -67.61541 ... |
| 1 | 0400000US25 | 25 | Massachusetts | | 7800.058 | MULTIPOLYGON (((-70.83204 41.60650, -70.82373 ... |
| 2 | 0400000US26 | 26 | Michigan | | 56538.901 | MULTIPOLYGON (((-88.68443 48.11579, -88.67563 ... |
| 3 | 0400000US30 | 30 | Montana | | 145545.801 | POLYGON ((-104.05770 44.99743, -104.25015 44.9... |
| 4 | 0400000US32 | 32 | Nevada | | 109781.180 | POLYGON ((-114.05060 37.00040, -114.04999 36.9... |
alt.Chart(geo_states, title='US states').mark_geoshape().encode(
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
We want to put the percentage estimates for each candidate onto the map. First, let us create a dataframe containing the data for each candidate.
# Create separate data frames for Trump and Biden
trump_data = df_nov_states[
df_nov_states.candidate_name == 'Donald Trump'
]
biden_data = df_nov_states[
df_nov_states.candidate_name == 'Joseph R. Biden Jr.'
]
Our spatial and poll data have the name of the state in common, but their columns have different names. We could rename the column so it is the same in all cases and then merge (see commented code below).
# Uncomment below to see the effect. This produces an almost identical geodataframe to code cell below, but more verbose. (Can you spot the difference?)
# Rename column names.
# trump_data.columns = ['cycle', 'NAME', 'modeldate', 'candidate_name', 'pct_estimate', 'pct_trend_adjusted']
# biden_data.columns = ['cycle', 'NAME', 'modeldate', 'candidate_name', 'pct_estimate', 'pct_trend_adjusted']
# We can join the geospatial and poll data using the NAME column (the name of the state).
# geo_states_trump = geo_states.merge(trump_data, on = 'NAME')
# geo_states_biden = geo_states.merge(biden_data, on = 'NAME')
We can join the geospatial and poll data using different column names by using left_on for the left data (usually the geodataframe) and right_on for the right dataframe.
# Add the poll data
geo_states_trump = geo_states.merge(trump_data, left_on = 'NAME', right_on = 'state')
geo_states_biden = geo_states.merge(biden_data, left_on = 'NAME', right_on = 'state')
geo_states_trump.head()
| | GEO_ID | STATE | NAME | LSAD | CENSUSAREA | geometry | cycle | state | modeldate | candidate_name | pct_estimate | pct_trend_adjusted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0400000US23 | 23 | Maine | | 30842.923 | MULTIPOLYGON (((-67.61976 44.51975, -67.61541 ... | 2020 | Maine | 11/3/2020 | Donald Trump | 40.34410 | 40.31588 |
| 1 | 0400000US25 | 25 | Massachusetts | | 7800.058 | MULTIPOLYGON (((-70.83204 41.60650, -70.82373 ... | 2020 | Massachusetts | 11/3/2020 | Donald Trump | 28.56164 | 28.86275 |
| 2 | 0400000US26 | 26 | Michigan | | 56538.901 | MULTIPOLYGON (((-88.68443 48.11579, -88.67563 ... | 2020 | Michigan | 11/3/2020 | Donald Trump | 43.20577 | 43.23326 |
| 3 | 0400000US30 | 30 | Montana | | 145545.801 | POLYGON ((-104.05770 44.99743, -104.25015 44.9... | 2020 | Montana | 11/3/2020 | Donald Trump | 49.74744 | 49.78661 |
| 4 | 0400000US32 | 32 | Nevada | | 109781.180 | POLYGON ((-114.05060 37.00040, -114.04999 36.9... | 2020 | Nevada | 11/3/2020 | Donald Trump | 44.32982 | 44.36094 |
geo_states_biden.head()
| | GEO_ID | STATE | NAME | LSAD | CENSUSAREA | geometry | cycle | state | modeldate | candidate_name | pct_estimate | pct_trend_adjusted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0400000US23 | 23 | Maine | | 30842.923 | MULTIPOLYGON (((-67.61976 44.51975, -67.61541 ... | 2020 | Maine | 11/3/2020 | Joseph R. Biden Jr. | 53.31518 | 53.32106 |
| 1 | 0400000US25 | 25 | Massachusetts | | 7800.058 | MULTIPOLYGON (((-70.83204 41.60650, -70.82373 ... | 2020 | Massachusetts | 11/3/2020 | Joseph R. Biden Jr. | 64.36328 | 64.62505 |
| 2 | 0400000US26 | 26 | Michigan | | 56538.901 | MULTIPOLYGON (((-88.68443 48.11579, -88.67563 ... | 2020 | Michigan | 11/3/2020 | Joseph R. Biden Jr. | 51.17806 | 51.15482 |
| 3 | 0400000US30 | 30 | Montana | | 145545.801 | POLYGON ((-104.05770 44.99743, -104.25015 44.9... | 2020 | Montana | 11/3/2020 | Joseph R. Biden Jr. | 45.34418 | 45.36695 |
| 4 | 0400000US32 | 32 | Nevada | | 109781.180 | POLYGON ((-114.05060 37.00040, -114.04999 36.9... | 2020 | Nevada | 11/3/2020 | Joseph R. Biden Jr. | 49.62386 | 49.65657 |
Joe Biden is clearly winning. Can we make it look like he is not?
We can plot this specifying the feature to use for our colour.
alt.Chart(geo_states_trump, title='Poll estimate for Donald Trump on 11/3/2020').mark_geoshape().encode(
color='pct_estimate',
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
To smooth out any differences we can bin our data.
alt.Chart(geo_states_trump, title='Poll estimate for Donald Trump on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate', bin=alt.Bin(step=35)),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
How would you interpret the plot above?
What about if we decrease the bin step so we have more bins?
alt.Chart(geo_states_trump, title='Poll estimate for Donald Trump on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate', bin=alt.Bin(step=5)),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
Perhaps try different step sizes for the bins and consider how bins can shape our interpretation of the data. What would happen if plots with different bin sizes were placed side by side?
To add further confusion, what happens when we log scale the data?
alt.Chart(geo_states_trump, title='Poll estimate for Donald Trump on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate', bin=alt.Bin(step=5), scale=alt.Scale(type='log')),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
vs
alt.Chart(geo_states_biden, title='Poll estimate for Joe Biden on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate', bin=alt.Bin(step=5), scale=alt.Scale(type='log')),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
What is happening here?!?!
Next up, what about the colours we use and the range of values assigned to each color? Code inspired by/taken from here.
alt.Chart(geo_states_trump, title='Poll estimate for Donal Trump on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate',
scale=alt.Scale(type="linear",
domain=[10, 40, 50, 55, 60, 61, 62],
range=["#414487","#414487",
"#355f8d","#355f8d",
"#2a788e",
"#fde725","#fde725"])),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)Compare that with
alt.Chart(geo_states_trump, title='Poll estimate for Donald Trump on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate',
scale=alt.Scale(type="linear",
domain=[10, 20, 30, 35, 68, 70, 100],
range=["#414487","#414487",
"#7ad151","#7ad151",
"#bddf26",
"#fde725","#fde725"])),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
My goodness! So what have we played around with?
- Transforming our scale using log
- Binning our data to smooth out variances
- Altering our colour scheme and the ranges for each colour
… what about if we remove the legend?
alt.Chart(geo_states_trump, title='Poll estimate for Donald Trump on 11/3/2020').mark_geoshape().encode(
alt.Color('pct_estimate',
scale=alt.Scale(type="linear",
domain=[10, 20, 30, 35, 68, 70, 100],
range=["#414487","#414487",
"#7ad151","#7ad151",
"#bddf26",
"#fde725","#fde725"]),
legend=None),
tooltip=['NAME', 'pct_estimate']
).properties(
width=500,
height=300
).project(
type='albersUsa'
)
Good luck trying to interpret that. Though we often see maps without legends and with questionable colour schemes on TV.
How do you think choropleths should be displayed? What information does a user need to understand the message communicated in these plots?